# ANES_coding_yearsOfSchooling.R

# Part of the replication archive for 
#
#   Bullock, John G. 2020. "Education and Attitudes toward Redistribution in
#   the United States." British Journal of Political Science 50.


# The cumulative ANES only has coarse measures of years of education, e.g., 
# "more than 8 years of education but no high school diploma." To create a 
# finer measure, I import the finer education variables that are available in  
# many of the year-specific ANES studies, e.g., the 1982 ANES.  

# ANES_codeYearsOfSchooling()
#
# Downloads the year-specific ANES time-series datasets for 1970-2008 (there
# was no time-series study in 2006), extracts the respondent ID and the
# years-of-schooling variable from each, and stacks the results.
#
# Arguments: none.
#
# Returns: a data frame with one row per case in each year-specific dataset
# and two columns:
#   ID.unique            the study year followed by the case ID (in most
#                        years left-padded with zeroes to four digits),
#                        stored as an integer, e.g., 19700001.
#   educYearsUncensored  years of schooling, with the codes listed in each
#                        year's recode specification set to NA.
#
# Side effects: attaches car, dplyr, haven, and stringr, and downloads files
# from https://electionstudies.org (so a network connection is required).
ANES_codeYearsOfSchooling <- function () {

  # library() rather than require(): a missing package should stop the
  # function here instead of producing a warning and a confusing failure
  # later on.
  library(car)      # for Recode()
  library(dplyr)    # for bind_rows(), rename(), %>%
  library(haven)    # for read_dta()
  library(stringr)  # for str_pad()


  ############################################################################
  # LOCAL HELPERS
  ############################################################################

  # Download the zip archive at "url" to a temporary file, read the Stata
  # file "member" from inside it, and delete the temporary file afterwards
  # (even if the download or the import fails).
  importANES <- function (url, member) {
    zipPath <- tempfile(fileext = '.zip')
    on.exit(unlink(zipPath), add = TRUE)
    download.file(
      url      = url,
      destfile = zipPath,
      mode     = 'wb')  # the archives are binary files
    unz(zipPath, member) %>%
      read_dta()
  }

  # Build the integer ID "<year><case ID>". When pad is TRUE, the case ID is
  # first left-padded with zeroes to a width of four characters.
  makeID <- function (year, caseID, pad = TRUE) {
    if (pad) {
      caseID <- str_pad(caseID, width = 4, side = 'left', pad = '0')
    }
    as.integer(paste0(year, caseID))
  }


  ############################################################################
  # DOWNLOAD ANES TIME-SERIES DATASETS
  ############################################################################

  # The code below downloads the year-specific ANES time-series studies from
  # their home at https://electionstudies.org. The ANES does not provide
  # permanent links (for example, DOI-based links) to any of its files, and
  # the links included in the code below will stop working when the ANES
  # changes the URLs. When that happens, you need only update the relevant
  # URL below to correct the problem. Of course, you can also download the
  # ANES files "by hand" and replace the code below with commands which load
  # the datasets that you've downloaded.  [2019 07 14]
  ANES1970 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1970dta.zip',
    member = 'NES1970.dta')

  ANES1972 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1972dta.zip',
    member = 'NES1972.dta')

  ANES1974 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/07/anes1974dta.zip',
    member = 'NES1974.dta')

  ANES1976 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1976dta.zip',
    member = 'NES1976.dta')

  ANES1978 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1978dta.zip',
    member = 'nes1978.dta')

  ANES1980 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1980dta.zip',
    member = 'NES1980.dta')

  ANES1982 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1982dta.zip',
    member = 'NES1982.dta')

  ANES1984 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1984dta.zip',
    member = 'NES1984.dta')

  ANES1986 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1986dta.zip',
    member = 'nes1986.dta')

  ANES1988 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1988dta.zip',
    member = 'NES1988.dta')

  ANES1990 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1990dta.zip',
    member = 'NES1990.dta')

  ANES1992 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1992dta.zip',
    member = 'NES1992.dta')

  ANES1994 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1994dta.zip',
    member = 'NES1994.dta')

  ANES1996 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1996dta.zip',
    member = 'nes96.dta')

  ANES1998 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes1998dta.zip',
    member = 'nes1998.dta')

  ANES2000 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes2000TSdta.zip',
    member = 'anes2000TS.dta')

  ANES2002 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes2002TSdta.zip',
    member = 'anes2002TS.dta')

  ANES2004 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes2004TSdta.zip',
    member = 'anes2004TS.dta')

  # 2006 -- there was no ANES time-series study in 2006.

  ANES2008 <- importANES(
    url    = 'https://electionstudies.org/wp-content/uploads/2018/06/anes_timeseries_2008_dta.zip',
    member = 'anes_timeseries_2008_stata12.dta')



  ############################################################################
  # CODE YEARS OF SCHOOLING
  ############################################################################
  # Each section below creates one data frame named tmp.df.<year>. The names
  # matter: the data frames are collected by name at the end of the function.
  # Columns are always referenced as DATASET$VARIABLE rather than through
  # attach(), so that nothing is left on the search path if a step fails and
  # so that no global object can mask a dataset column.

  # 1970 VARIABLES
  # There are 1694 cases in the 1970 time-series study, but only 1507 of them
  # made it into the cumulative ANES. The main reason, by far, is that
  # respondents from the "Negro supplement" -- those with interview numbers of
  # 2000 or above (ID.1970 >= 19702000) -- are not in the cumulative ANES.
  ID.1970       <- makeID('1970', ANES1970$V700002)
  educ.yrs.1970 <- Recode(
    var     = ANES1970$V700269,
    recodes = '11=1; 12=2; 13=3; 14=4; 15=5; 16=6; 17=7; 18=7; c(21,22)=8; c(31,41)=9; c(32,42)=10; c(33,43)=11; c(51,61)=12; 71=13; 81=16; 82:86=17; else=NA') %>%
    as.integer()
  tmp.df.1970   <- data.frame(
    ID.unique = ID.1970,
    educ.yrs  = educ.yrs.1970)


  # 1972 VARIABLES
  ID.1972       <- makeID('1972', ANES1972$V720002)
  educ.yrs.1972 <- Recode(
    var     = ANES1972$V720300,
    recodes = '11=1; 12=2; 13=3; 14=4; 15=5; 16=6; 17=7; 18=7; c(21,22)=8; c(31,41)=9; c(32,42)=10; c(33,43)=11; 50=11; c(51,61)=12; 71=13; 81=16; 82:86=17; else=NA') %>%
    as.integer()
  tmp.df.1972   <- data.frame(
    ID.unique = ID.1972,
    educ.yrs  = educ.yrs.1972)


  # 1974 VARIABLES
  ID.1974       <- makeID('1974', ANES1974$V742002)
  educ.yrs.1974 <- Recode(ANES1974$V742418, '98:99=NA') %>% as.integer()
  tmp.df.1974   <- data.frame(
    ID.unique = ID.1974,
    educ.yrs  = educ.yrs.1974)


  # 1976 VARIABLES
  ID.1976       <- makeID('1976', ANES1976$V763002)
  educ.yrs.1976 <- Recode(ANES1976$V763384, '98:99=NA') %>% as.integer()
  tmp.df.1976   <- data.frame(
    ID.unique = ID.1976,
    educ.yrs  = educ.yrs.1976)


  # 1978 VARIABLES
  ID.1978       <- makeID('1978', ANES1978$V780002)
  educ.yrs.1978 <- Recode(ANES1978$V780506, '98:99=NA') %>% as.integer()
  tmp.df.1978   <- data.frame(
    ID.unique = ID.1978,
    educ.yrs  = educ.yrs.1978)


  # 1980 VARIABLES
  # Education question: "What is the highest grade of school or year of
  # college that you completed?"
  ID.1980       <- makeID('1980', ANES1980$V800004)
  educ.yrs.1980 <- Recode(ANES1980$V800429, '98:99=NA') %>% as.integer()
  tmp.df.1980   <- data.frame(
    ID.unique = ID.1980,
    educ.yrs  = educ.yrs.1980)


  # 1982 VARIABLES
  ID.1982       <- makeID('1982', ANES1982$V820004)
  educ.yrs.1982 <- Recode(ANES1982$V820537, '98:99=NA') %>% as.integer()
  tmp.df.1982   <- data.frame(
    ID.unique = ID.1982,
    educ.yrs  = educ.yrs.1982)


  # 1984 VARIABLES
  # On years of schooling, see the one-page technical note at
  # ftp://ftp.electionstudies.org/ftp/nes/bibliography/documents/nes010131.pdf.
  ID.1984       <- makeID('1984', ANES1984$V840004)
  educ.yrs.1984 <- Recode(ANES1984$V840431, '98:99=NA') %>% as.integer()
  tmp.df.1984   <- data.frame(
    ID.unique = ID.1984,
    educ.yrs  = educ.yrs.1984)


  # 1986 VARIABLES
  ID.1986       <- makeID('1986', ANES1986$V860004)
  educ.yrs.1986 <- Recode(ANES1986$V860599, '98:99=NA') %>% as.integer()
  tmp.df.1986   <- data.frame(
    ID.unique = ID.1986,
    educ.yrs  = educ.yrs.1986)


  # 1988 VARIABLES
  ID.1988       <- makeID('1988', ANES1988$V880004)
  educ.yrs.1988 <- Recode(ANES1988$V880419, '98:99=NA') %>% as.integer()
  tmp.df.1988   <- data.frame(
    ID.unique = ID.1988,
    educ.yrs  = educ.yrs.1988)


  # 1990 VARIABLES
  ID.1990       <- makeID('1990', ANES1990$V900004)
  educ.yrs.1990 <- Recode(ANES1990$V900554, '98:99=NA') %>% as.integer()
  # NOTE(review): the two statements below look like leftovers from a version
  # in which this variable was imported as a factor with text labels. At this
  # point educ.yrs.1990 is already an integer vector, so the label-based
  # recodes presumably match nothing. They are retained unchanged because
  # they may still affect the storage type or attributes of the column --
  # TODO confirm before removing.
  levels(educ.yrs.1990) <- gsub('^(.*?)[, ].*', '\\1', levels(educ.yrs.1990))  # strip all but the first word
  educ.yrs.1990         <- Recode(educ.yrs.1990, '"NONE"=0; "One"=1; "Two"=2; "Three"=3; 
                                                  "Four"=4; "Five"=5; "Six"=6; 
                                                  "Seven"=7; "Eight"=8; "Nine"=9; 
                                                  "Ten"=10; "Eleven"=11; "Twelve"=12; 
                                                  "Thirteen"=13; "Fourteen"=14; 
                                                  "Fifteen"=15; "Sixteen"=16; 
                                                  "Seventeen"=17; c("INAP", "DK", "NA", "99", "99.")=NA', 
                                  as.factor=FALSE)
  tmp.df.1990   <- data.frame(
    ID.unique = ID.1990,
    educ.yrs  = educ.yrs.1990)


  # 1992 VARIABLES
  ID.1992       <- makeID('1992', ANES1992$V923004)
  educ.yrs.1992 <- Recode(ANES1992$V923905, '96:99=NA') %>% as.integer()
  # NOTE(review): as in 1990, the two statements below appear to be leftovers
  # that no longer change any value; retained unchanged -- TODO confirm
  # before removing.
  levels(educ.yrs.1992) <- gsub('^(.*?)[, ].*', '\\1', levels(educ.yrs.1992))  # strip all but the first word
  educ.yrs.1992       <- Recode(educ.yrs.1992, '"NONE"=0; "One"=1; "Two"=2; "Three"=3; 
                                          "Four"=4; "Five"=5; "Six"=6; 
                                          "Seven"=7; "Eight"=8; "Nine"=9; 
                                          "Ten"=10; "Eleven"=11; "Twelve"=12; 
                                          "Thirteen"=13; "Fourteen"=14; 
                                          "Fifteen"=15; "Sixteen"=16; 
                                          "Seventeen"=17; c("INAP", "DK", "NA")=NA', 
                                          as.factor=FALSE)
  tmp.df.1992   <- data.frame(
    ID.unique = ID.1992,
    educ.yrs  = educ.yrs.1992)


  # 1994 VARIABLES
  # 759 subjects are missing "educ.yrs" data in this year.  They are panel
  # respondents who weren't asked this question, but whose answers may be in
  # the 1992 time-series study.
  # The case ID is not zero-padded in this year.
  ID.1994       <- makeID('1994', ANES1994$V940001, pad = FALSE)
  educ.yrs.1994 <- Recode(ANES1994$V941206, '96:99=NA') %>% as.integer()
  tmp.df.1994   <- data.frame(
    ID.unique = ID.1994,
    educ.yrs  = educ.yrs.1994)


  # 1996 VARIABLES
  # The case ID is not zero-padded in this year.
  ID.1996       <- makeID('1996', ANES1996$V960001, pad = FALSE)
  educ.yrs.1996 <- Recode(ANES1996$V960607, '96:99=NA') %>% as.integer()
  tmp.df.1996   <- data.frame(
    ID.unique = ID.1996,
    educ.yrs  = educ.yrs.1996)


  # 1998 VARIABLES
  ID.1998       <- makeID('1998', ANES1998$V980001)
  educ.yrs.1998 <- Recode(ANES1998$V980574, '98:99=NA') %>% as.integer()
  tmp.df.1998   <- data.frame(
    ID.unique = ID.1998,
    educ.yrs  = educ.yrs.1998)


  # 2000 VARIABLES
  ID.2000       <- makeID('2000', ANES2000$V000001)
  educ.yrs.2000 <- Recode(ANES2000$V000910, '98:99=NA') %>% as.integer()
  tmp.df.2000   <- data.frame(
    ID.unique = ID.2000,
    educ.yrs  = educ.yrs.2000)


  # 2002 VARIABLES
  # Few people have educ.yrs values in 2002 because most respondents in 2002
  # were panel respondents, and they had answered the question in 2000.
  ID.2002       <- makeID('2002', ANES2002$V020001)
  educ.yrs.2002 <- Recode(ANES2002$V023128, '88:99=NA') %>% as.integer()
  tmp.df.2002   <- data.frame(
    ID.unique = ID.2002,
    educ.yrs  = educ.yrs.2002)


  # 2004 VARIABLES
  ID.2004       <- makeID('2004', ANES2004$V040001)
  educ.yrs.2004 <- Recode(ANES2004$V043252, '88:99=NA') %>% as.integer()
  tmp.df.2004   <- data.frame(
    ID.unique = ID.2004,
    educ.yrs  = educ.yrs.2004)


  # 2008 VARIABLES
  ID.2008       <- makeID('2008', ANES2008$V080001)
  educ.yrs.2008 <- Recode(ANES2008$V083217, 'c(-9, -8, 99)=NA') %>% as.integer()
  tmp.df.2008   <- data.frame(
    ID.unique = ID.2008,
    educ.yrs  = educ.yrs.2008)


  # RETURN A DATA FRAME
  # ls() returns the matching local names in sorted order, so the year-
  # specific data frames are stacked chronologically.
  yearDFnames         <- ls(pattern = '^tmp\\.df\\.\\d{4}$')
  yearDFsCombinedList <- mget(yearDFnames)
  bind_rows(yearDFsCombinedList) %>%
    rename(educYearsUncensored = "educ.yrs")
}